import asyncio
import time
import aiohttp
from lxml import etree
from concurrent.futures import ThreadPoolExecutor, as_completed
import pymysql
import openpyxl
from urllib.parse import quote
from proxy import proxy_pool
import random
from User_Agents import user_agents
from queue import Queue
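
# Pipeline overview: geturls() reads brand / page-count / url rows from
# url_info.xlsx, asy_main() fetches every page concurrently with aiohttp
# coroutines, and multi_main() parses the html in a thread pool and writes
# the product names to MySQL. proxy.py and User_Agents.py are local helper
# modules not shown here; proxy_pool() is assumed to return proxy url strings
# of the 'http://ip:port' form that aiohttp's proxy= parameter accepts, and
# user_agents() a list of User-Agent strings.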


# Crawler worker: fetch one search page
async def craw(session, url, brand, sem):
    headers = {
        'User-Agent': random.choice(user_agents),
        'cookie': '',
        'referer': url
    }
    try:
        # The semaphore caps the number of in-flight requests; acquiring it
        # here, around the actual request, is what throttles concurrency.
        async with sem:
            # Route the request through a random proxy
            async with session.get(url=url, headers=headers, proxy=random.choice(proxy_list)) as response:
                # without a proxy: async with session.get(url=url, headers=headers) as response:
                res = await response.text()  # the response body as a string
                # Use len(res) to decide whether this is the page source we want:
                # anti-bot placeholder pages are far shorter than real ones.
                if len(res) < 300:
                    raise Exception('blocked or empty page')  # not the real source: jump to except
                else:
                    return (brand, res)  # real page source: return it

    # Blocked or errored: record the url in fail_url, then return None
    except Exception:
        fail_url.append(brand + '-' + url)
        return None


async def asy_main(urls, html_queue):
    sem = asyncio.Semaphore(10)  # limit to 10 concurrent requests
    print('asy craw ...')  # mark the start of crawling
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(ssl=False)) as session:
        # Each entry is a 'brand-url' string; split once on the first '-' to
        # recover both parts, and pass the shared semaphore into every worker
        # (holding it here around task creation would not throttle anything).
        craw_tasks = [asyncio.create_task(craw(session, i.split('-', 1)[1], i.split('-', 1)[0], sem)) for i in urls]
        done, pending = await asyncio.wait(craw_tasks)
    print('asy craw COMPLETE ...')
    html_queue.put(done)
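
# Note: html_queue holds one set of finished Tasks per asy_main() run; each
# successful task's result is a (brand, html) tuple that multi_main() consumes.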


def parse(tup):
    items = []
    brand, html = tup
    tree = etree.HTML(html)
    nodes = tree.xpath('/html/body//li[@class="gl-item"]')
    for i in nodes:
        # Keep only items that carry a non-empty data-spu attribute
        spu = i.xpath('./@data-spu')
        if spu and spu[0]:
            name = ''.join(i.xpath('./div//div[contains(@class,"p-name p-name-type-2")]/a/em/text()'))
            items.append({'name': name})
    print(len(items))
    return (brand, items)
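
# Illustrative parse() result (the product names here are made up):
#   ('Dell', [{'name': 'Dell XPS 13 ...'}, {'name': 'Dell G15 ...'}])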


def multi_main(html_queue):
    print('multi parse ...')
    # Connect to MySQL
    conn = pymysql.connect(host='192.168.204.128',
                           port=3306,
                           user='Coisini',
                           password='Wxylkxy0415.@',
                           db='text',
                           charset='utf8')
    # Parse the fetched pages in a thread pool
    with ThreadPoolExecutor() as pool:
        futures = []
        while not html_queue.empty():
            done = html_queue.get()
            for task in done:
                if task.result():  # skip tasks that returned None (failed fetches)
                    futures.append(pool.submit(parse, task.result()))
    # Walk the futures with as_completed and run the insert statements
    for future in as_completed(futures):
        brand, items = future.result()
        cursor = conn.cursor()
        sql = 'insert into jd(brand,name) values(%s,%s)'
        for item in items:
            cursor.execute(sql, (brand, item['name']))
        conn.commit()
        cursor.close()

    print('multi parse COMPLETE ...')
    # Close the connection
    conn.close()
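
# A minimal batching sketch (same jd(brand,name) table assumed): collecting a
# future's rows and inserting them with a single executemany() call cuts the
# round trips compared with one execute() per row:
#
#   rows = [(brand, item['name']) for item in items]
#   with conn.cursor() as cursor:
#       cursor.executemany('insert into jd(brand,name) values(%s,%s)', rows)
#   conn.commit()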


# Return urls, each formatted as 'brand-url' (str)
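# Expected url_info.xlsx layout (assumed from how the columns are read below):
#   column A: brand name, column B: number of search result pages,
#   column C: base search url for that brand. Only sheet row 2 is read
#   (min_row=2, max_row=2).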
def geturls():
    print('GET URL ...')
    urls = []
    wb = openpyxl.load_workbook('url_info.xlsx')
    ws = wb.active
    scope = ws.iter_rows(min_row=2, max_row=2, min_col=1, max_col=3)
    for rows in scope:
        # Odd pages use the normal search url stored in the sheet
        for page in range(1, 2 * int(rows[1].value) + 1, 2):
            a = rows[0].value + '-' + str(rows[2].value) + f'&cid2=671&page={page}'
            urls.append(a)
        # Even pages use JD's scroll-loading endpoint (s_new.php, scrolling=y),
        # which serves the second half of each visible result page
        for page in range(2, 2 * int(rows[1].value) + 1, 2):
            b = rows[0].value + '-' + f'https://search.jd.com/s_new.php?keyword=%E7%AC%94%E8%AE%B0%E6%9C%AC%E7' \
                                      f'%94%B5%E8%84%91&ev=exbrand_{quote(rows[0].value)}%5E&cid3=672&cid2=671&page={page}&s=1&scrolling=y'
            urls.append(b)
    print('COMPLETE ...')
    return urls


if __name__ == '__main__':
    fail_url = []  # global list collecting urls that failed to fetch
    urls = geturls()  # collect the urls
    print(len(urls), 'pages in total')  # report how many urls there are
    user_agents = user_agents()  # rebind the imported factory to the actual UA list
    proxy_list = proxy_pool()  # instantiate the proxy pool
    html_queue = Queue()  # queue buffering fetched pages between the two stages

    # Run the coroutine-based crawler
    asyncio.run(asy_main(urls, html_queue))

    # Retry failed urls until none remain; fail_url must be cleared before each
    # retry, otherwise stale entries would keep the loop from ever terminating.
    while True:
        if fail_url:
            fail_url_ = list(set(fail_url))  # de-duplicate
            fail_url.clear()
            time.sleep(3)
            print(f'urls still failing---{len(fail_url_)}')
            asyncio.run(asy_main(fail_url_, html_queue))
        else:
            print('Crawling fully complete!!!')
            break

    # Parse the html in a thread pool and write the results to MySQL
    multi_main(html_queue)
